/*

DATE: June 18 2018

PROJECT: Police killings and mental health of black Americans, replication archive

PURPOSE: This file contains code to set up the datasets and for the estimates presented in:

-Table 1
-Table 2
-Fig 1

It also includes code for estimates displayed in eFig 1, eTable 1, eTable 2, eTable 3, eTable 4 of the Supplemental Appendix. The estimates used to 
generate Fig 2 and eTable 2 are in separate do files. 

*/

*FILEPATHS
global working "[FILE PATH]/Working Datasets/"
global source "[FILE PATH]/Source Datasets and Do Files/"

****CREATE EFIGURE 1*****
*this is a MAP OF POLICE SHOOTINGS (CUMULATIVE) by state
use "$working/black_unarmed_shootings.dta", clear
collapse (sum) count, by(stfip)
joinby stfip using "$source/abbtofip.dta", 
rename abbrev state
maptile count, geo(state)
clear

*****DATA SET UP*******
**MERGE DATA
use "$working/black_unarmed_shootings.dta", clear

*histogram of shootings per month
expand count

bys stfip: egen total_shot = total(count)
gen shot_per_month = total_shot/45
hist shot_per_month


joinby stfip using "$working/brfss_black_sample_June182018.dta", unmatched(master) _merge(_BRF)

*Generate time from shooting estimate for every shooting-individual pair in the data
*Combine these into 3 month intervals 
gen dist = int_date - shooting_date
gen timing = .

replace timing = 1 if dist>=0&dist<=90
replace timing = 2 if dist>90&dist<=180
replace timing = 3 if dist>180&dist<=270
replace timing = 4 if dist>270&dist<360


replace timing = -1 if dist<0&dist>=-90
replace timing = -2 if dist<=-91&dist>=-180
replace timing = -3 if dist<=-181&dist>=-270
replace timing = -4 if dist<=-271&dist>=-360

tab timing, gen(t_)  /*negative numbers reflect a police killing date AFTER interview; t_5 is the primary exposure: an indicator for a police killing in the 90 days before the interview date */

*Denote whether police killing occured in any of +/- 6 monthly bins around interview date
gen monthdist = .
replace monthdist = 1 if dist>=0&dist<=30
replace monthdist = 2 if dist>30&dist<=60
replace monthdist = 3 if dist>60&dist<=90
replace monthdist = 4 if dist>90&dist<=120
replace monthdist = 5 if dist>120&dist<=150
replace monthdist = 6 if dist>150&dist<=180

replace monthdist = -1 if dist<0&dist>=-30
replace monthdist = -2 if dist<-30&dist>=-60
replace monthdist = -3 if dist<-60&dist>=-90
replace monthdist = -4 if dist<-90&dist>=-120
replace monthdist = -5 if dist<-120&dist>=-150
replace monthdist = -6 if dist<-150&dist>=-180

tab monthdist, gen(m_)	/*m_7 is the immediate post period (month) */

*Collapse data to individual level
*This will yield a dataset where t_* and m_* are counts of number of shootings in each quarterly or monthly bin around the interview date

collapse (sum) t_* m_* (mean) _psu shot_per_month genhlth physhlth menthlth _age educa stfip imonth iyear iday _ll sex int_dat drnk _rfsm exerany income2, by(id)

*Generate a measure of exposure to any police killing in the 4 quarters around BRFSS interview
forvalues x = 1/8 {
	gen any_`x' = t_`x'
	recode any_`x' (1/max = 1)
	}
	
*Create additional mental health measures (secondary outcomes)
gen any_poorMH_day = menthlth
recode any_poorMH_day (1/max = 1)
gen freq_MH_days = menthlth
recode freq_MH_days (0/13 = 0) (14/max = 1)

*****GENERATE ESTIMATES FOR TABLE 1****
*This are the DESCRIPTIVE STATS
*Define estimation sample
xi: reghdfe menthlth t_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
mark sample if e(sample)

*Outcomes
sum menthlth if sample==1 [aw = _llcpwt]

/* first command gives the raw number, second the weighted proportion*/
tab any_poorMH_day if sample==1 
tab any_poorMH_day if sample==1 [aw = _llcpwt]

tab freq_MH_days if sample==1
tab freq_MH_days if sample==1 [aw = _llcpwt]

*Exposures
sum t_5 if sample==1 [aw = _llcpwt]
tab any_5 if sample==1
tab any_5 if sample==1 [aw = _llcpwt]

*Covariates
tab sex if sample==1
tab sex if sample==1 [aw = _llcpwt]

gen age2 = _ageg
recode age2 (1/3 = 1) (4/9 = 2) (10/13 = 3) (14 =4)
tab age2 if sample==1
tab age2 if sample==1 [aw = _llcpwt]

gen college = educa
recode college (0/1 = 0) (2/3=1)
tab college if sample==1
tab college if sample==1 [aw = _llcpwt]

/* we did not use svy commands here, but doing so generates the same estimates: e.g.:
svyset _psu [pweight  = _llcpwt]
svy: mean menthlth if sample==1 
svy: mean t_5 if sample==1 
svy: prop college if sample==1
*/

******GENERATE ESTIMATES FOR TABLE 3*******
*These are the MAIN REGRESSION ESTIMATES
*We use the -REGHDFE- command in order to calculate more accurate S.E.s in the face of multiple levels of fixed effects
*See: http://scorreia.com/demo/reghdfe.html and https://www.fionaburlig.com/blog/2016/8/16/is-the-file-drawer-too-large 

*Col (1)
xi: reghdfe menthlth t_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)

*Col (2) - Poisson Estimates for main outcome
set matsize 1200
xi: poisson menthlth t_5 i.educa i._ageg i.iday i.sex i.stfip*i.imonth i.imonth*i.iyear [pw  = _llcpwt] , robust cluster(stfip) irr
xi: poisson menthlth any_5 i.educa i._ageg i.iday i.sex i.stfip*i.imonth i.imonth*i.iyear [pw  = _llcpwt] , robust cluster(stfip) irr

*Col (3) and (4)  - Poisson for secondary outcomes
xi: poisson any_poorMH_day t_5 i.educa i._ageg i.iday i.sex i.stfip*i.imonth i.imonth*i.iyear [pw  = _llcpwt] , robust cluster(stfip) irr
xi: poisson any_poorMH_day any_5 i.educa i._ageg i.iday i.sex i.stfip*i.imonth i.imonth*i.iyear [pw  = _llcpwt] , robust cluster(stfip) irr
xi: poisson freq_ t_5 i.educa i._ageg i.iday i.sex i.stfip*i.imonth i.imonth*i.iyear [pw  = _llcpwt] , robust cluster(stfip) irr
xi: poisson freq_ any_5 i.educa i._ageg i.iday i.sex i.stfip*i.imonth i.imonth*i.iyear [pw  = _llcpwt] , robust cluster(stfip) irr

/*
*Wild Cluster-Bootstrap T 
*NOTE:These are computationally very expensive and can only be run on linear version of models

*Col (1) and (2)
xi: reg menthlth any_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa [pw  = _llcpwt] , cluster(stfip) 
xi: bootwildct any_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa  , numvars(1) bootreps(50)
xi: reg menthlth t_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa [pw  = _llcpwt] , cluster(stfip) 
xi: bootwildct t_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa  , numvars(1) bootreps(50)

*Col (3) and (4)
xi: reg any_poorMH_day any_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa [pw  = _llcpwt] , cluster(stfip) 
xi: bootwildct any_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa  , numvars(1) bootreps(50)
xi: reg any_poorMH_day t_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa [pw  = _llcpwt] , cluster(stfip) 
xi: bootwildct t_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa  , numvars(1) bootreps(50)

xi: reg freq_ any_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa [pw  = _llcpwt] , cluster(stfip) 
xi: bootwildct any_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa  , numvars(1) bootreps(50)
xi: reg freq_  t_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa [pw  = _llcpwt] , cluster(stfip) 
xi: bootwildct t_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa  , numvars(1) bootreps(50)

*/

/*
Note: We opted not to use svy commands in order to make use of the -REGHDFE- command (for reasons described above).
Our core results are unchanged using svy. For example compare the results from: 

svyset stfip [pweight  = _llcpwt]
set matsize 1200

xi: svy: reg menthlth t_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa , 

To these:
xi: reg menthlth t_5 i.stfip*i.imonth i.iyear*i.imonth i.iday sex i._ageg5 i.educa [pw  = _llcpwt] , cluster(stfip) 

*/

*****GENERATE ESTIMATES FOR FIGURE 1****
*This is the EVENT STUDY MODEL with embedded FALSIFICATION TEST
xi: reghdfe menthlth m_1-m_12 [pw  = _llcpwt], abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)

****GENERATE ESTIMATES FOR ETABLE 1****
*Sensitivity to adding additional covariates

*Add state-year FE
xi: reghdfe menthlth t_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear stfip#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear stfip#iyear iday sex _ageg5yr educa) cluster(stfip)

*Add census div-year-month FE
*Need to bring in an additional datafile for this
joinby stfip using "$source/fiptocensusregion.dta", unmatched(master) _merge(_CENSUS)
gen double census_year = census*10000 + iyear
xi: reghdfe menthlth t_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear census_year#imonth iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear census_year#imonth  iday sex _ageg5yr educa) cluster(stfip)

*Add income as a covariate
xi: reghdfe menthlth t_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa income2) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa income2) cluster(stfip)

****GENERATE ESTIMATES FOR ETABLE 2****
*Selection into the sample based on observed characteristics

gen age_cont = _ageg5yr
recode age_cont (1 = 21) (2 = 27) (3 = 32) (4 = 37) (5 = 42) (6 = 47) (7 = 52) (8 = 57) (9 = 62) (10 = 67) (11 = 72) (12 = 77) (13 = 85) (14 = .)

foreach x of varlist sex college age_cont {
	xi: reghdfe `x' t_5 [pw  = _llcpwt] if iyear>2012, abs(stfip#imonth imonth#iyear iday) cluster(stfip)
	xi: reghdfe `x' any_5 [pw  = _llcpwt] if iyear>2012, abs(stfip#imonth imonth#iyear iday) cluster(stfip)
	}
	

****GENERATE ESTIMATES FOR ETABLE 3****
*Mental health consequences for one versus two or more police killings
gen dose_resp = t_5
recode dose_resp (0=0) (1=1) (2/max = 2)

xi: reghdfe menthlth i.dose_resp [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)

****GENERATE ESTIMATES FOR ETABLE 4****
*Estimates by BRFSS participant demographic groups
*We first estimate the main models by each subgroup, and then a fully interacted model to assess p-value of difference between coefficients

*Sex (0 = men, 1 = women)
xi: reghdfe menthlth t_5 [pw  = _llcpwt] if sex==0, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if sex==0, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth t_5 [pw  = _llcpwt] if sex==1, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if sex==1, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)

gen stfip_sex = stfip*10 + sex
gen iyear_sex = iyear*10 + sex
xi: reghdfe menthlth i.sex*t_5 [pw  = _llcpwt] , abs(stfip_sex#imonth imonth#iyear_sex sex#iday sex#_ageg5yr sex#educa) cluster(stfip)
xi: reghdfe menthlth i.sex*any_5 [pw  = _llcpwt] , abs(stfip_sex#imonth imonth#iyear_sex sex#iday sex#_ageg5yr sex#educa) cluster(stfip)


*Age groups; 18-34, 35-49, 50-64, 65+
gen age_grp = _ageg
recode age_grp (1/3 = 1) (4/6 = 2) (7/9 = 3) (10/13 = 4)

xi: reghdfe menthlth t_5 [pw  = _llcpwt] if age_grp==1, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if age_grp==1, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth t_5 [pw  = _llcpwt] if age_grp==2, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if age_grp==2, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth t_5 [pw  = _llcpwt] if age_grp==3, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if age_grp==3, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth t_5 [pw  = _llcpwt] if  age_grp==4, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if age_grp==4, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)

gen stfip_age = stfip*10 + age_grp
gen iyear_age = iyear*10 + age_grp

xi: reghdfe menthlth i.age_grp*t_5 [pw  = _llcpwt] , abs(stfip_age#imonth imonth#iyear_age age_grp#iday sex#age_grp age_grp#educa) cluster(stfip)
test _IageXany_5_3 _IageXany_5_4 _IageXany_5_2
xi: reghdfe menthlth i.age_grp*any_5 [pw  = _llcpwt] , abs(stfip_age#imonth imonth#iyear_age age_grp#iday sex#age_grp age_grp#educa) cluster(stfip)
test _IageXany_5_3 _IageXany_5_4 _IageXany_5_2

*education (less than HS, HS, college+)
xi: reghdfe menthlth t_5 [pw  = _llcpwt] if educa==0, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if educa==0, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)

xi: reghdfe menthlth t_5 [pw  = _llcpwt] if educa==1, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if educa==1, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)

xi: reghdfe menthlth t_5 [pw  = _llcpwt] if college==1, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if college==1, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)

gen educ_grp = educa
recode educ_grp (0=0) (1=1) (2/max = 2) 
gen stfip_educ = stfip*10 + educ_grp
gen iyear_educ = iyear*10 + educ_grp

xi: reghdfe menthlth i.educ_grp*t_5 [pw  = _llcpwt] , abs(stfip_educ#imonth imonth#iyear_educ educ_grp#iday sex#educ_grp educa _ageg5yr#educ_grp) cluster(stfip)
test _IeduXt_5_1 _IeduXt_5_2
xi: reghdfe menthlth i.educ_grp*any_5 [pw  = _llcpwt] , abs(stfip_educ#imonth imonth#iyear_educ educ_grp#iday sex#educ_grp educa _ageg5yr#educ_grp) cluster(stfip)
test _IeduXt_5_1 _IeduXt_5_2

*income (<35K vs 35k and above)
xi: reghdfe menthlth t_5 [pw  = _llcpwt] if income2<=4, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa income2) cluster(stfip)
xi: reghdfe menthlth any_5 [pw  = _llcpwt] if income2>4, abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa income2) cluster(stfip)

mark inc_group if income2>4
recode inc_group (min/max = .) if income2==.
gen stfip_inc = stfip*10 + inc_group
gen iyear_inc = iyear*10 + inc_group

xi: reghdfe menthlth i.inc_group*t_5 [pw  = _llcpwt] , abs(stfip_inc#imonth imonth#iyear_inc inc_group#iday inc_group#sex inc_group#_ageg5yr educa#inc_group) cluster(stfip)
xi: reghdfe menthlth i.inc_group*any_5 [pw  = _llcpwt] , abs(stfip_inc#imonth imonth#iyear_inc inc_group#iday inc_group#sex inc_group#_ageg5yr educa#inc_group) cluster(stfip)


*****GENERATE ESTIMATES FOR TABLE 2****
*Descriptive stats for police killings data
clear

use "$working/MPV_data_master.dta"
destring Victimsage, force replace

*Killings of unarmed black Americans
sum Victimsage if Victimsrace=="Black"&Unarmed=="Unarmed"
tab Victimsgender if Victimsrace=="Black"&Unarmed=="Unarmed"

*Killings of armed black Americans
sum Victimsage if Victimsrace=="Black"&Unarmed=="Allegedly Armed"
tab Victimsgender if Victimsrace=="Black"&Unarmed=="Allegedly Armed"

*Killings of unarmed white Americans
sum Victimsage if Victimsrace=="White"&Unarmed=="Unarmed"
tab Victimsgender if Victimsrace=="White"&Unarmed=="Unarmed"

*Killings of armed white Americans
sum Victimsage if Victimsrace=="White"&Unarmed=="Allegedly Armed"
tab Victimsgender if Victimsrace=="White"&Unarmed=="Allegedly Armed"

/*
*****ADDITIONAL NOT IN APPENDIX - SELECTION INTO THE SAMPLE ON THE BASIS OF RACE *****
*We found no differential change in the race of the BRFSS participant as a function of exposure to police killings
*Note: This is computationally expensive.

use "$working/black_unarmed_shootings.dta", clear

*histogram of shootings per month
expand count

bys stfip: egen total_shot = total(count)
gen shot_per_month = total_shot/45
hist shot_per_month

joinby stfip using "$working/brfss_working_full_June182018.dta", unmatched(master) _merge(_BRF)

*Generate time from shooting estimate for every shooting-individual pair in the data
*Combine these into 3 month intervals 
gen dist = int_date - shooting_date
gen timing = .

replace timing = 1 if dist>=0&dist<=90
replace timing = 2 if dist>90&dist<=180
replace timing = 3 if dist>180&dist<=270
replace timing = 4 if dist>270&dist<360


replace timing = -1 if dist<0&dist>=-90
replace timing = -2 if dist<=-91&dist>=-180
replace timing = -3 if dist<=-181&dist>=-270
replace timing = -4 if dist<=-271&dist>=-360

tab timing, gen(t_)  /*negative numbers reflect a shooting date AFTER interview; t_5 is the immediate post period */

*Denote whether shooting occured in any of +/- 6 monthly bins around interview date
gen monthdist = .
replace monthdist = 1 if dist>=0&dist<=30
replace monthdist = 2 if dist>30&dist<=60
replace monthdist = 3 if dist>60&dist<=90
replace monthdist = 4 if dist>90&dist<=120
replace monthdist = 5 if dist>120&dist<=150
replace monthdist = 6 if dist>150&dist<=180

replace monthdist = -1 if dist<0&dist>=-30
replace monthdist = -2 if dist<-30&dist>=-60
replace monthdist = -3 if dist<-60&dist>=-90
replace monthdist = -4 if dist<-90&dist>=-120
replace monthdist = -5 if dist<-120&dist>=-150
replace monthdist = -6 if dist<-150&dist>=-180

tab monthdist, gen(m_)	/*m_7 is the immediate post period (month) */

*Collapse data to individual level
*This will yield a dataset that contains counts of number of shootings in each quarterly or monthly bin around the interview data

collapse (sum) t_* m_* (mean) _psu _racegr shot_per_month genhlth physhlth menthlth _age educa stfip imonth iyear iday _ll sex int_dat drnk _rfsm exerany income2, by(id)

*Generate a measure of exposure to any police killing in the 4 quarters around BRFSS interview
forvalues x = 1/8 {
	gen any_`x' = t_`x'
	recode any_`x' (1/max = 1)
	}
	
gen black = _race
recode black (1=0) (2=1) (3/max = 0)

xi: reghdfe black t_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)
xi: reghdfe black any_5 [pw  = _llcpwt] , abs(stfip#imonth imonth#iyear iday sex _ageg5yr educa) cluster(stfip)

*/
